##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## R code to perform the main empirical analyses for 
##  "Every Story Has a Beginning, Middle, and an End (But Not Always in That Order): 
##   Predicting duration dynamics in a unified framework"
##
##  Authors: Daina Chiba, Nils W. Metternich, and Michael D. Ward
##
##  Last modified: 2014-08-22 
##  Contact Daina Chiba (daina.chiba@gmail.com) for any questions.
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# This script produces 
#
#  (1) numerical results reported in Table 2 (Table2.txt)
#
#  (2) Figure 6: In-sample prediction assessment:
#      Figure6-1_pred_absdiff.pdf
#      Figure6-2_pred_diff.pdf
#

# > sessionInfo()
# R version 3.1.1 (2014-07-10)
# Platform: x86_64-apple-darwin13.1.0 (64-bit)


## Set the working directory if necessary

# setwd("your_path_to_the_folder")


# preparation -------------------------------------------------------------

rm(list=ls())
require(SPDDD)
data(ucdp.triad)

# Equations ---------------------------------------------------------------

## Pre-conflict peace duration equation
eq.p1 <- t1 ~ lnrgdpch1l + xpolity + xpolity.sq + lnpopl + diamond + onshore + excluded.1r

## Post-conflict peace duration equation
eq.p2 <- t1 ~ lnrgdpch1l + prev.VicReb + prev.VicGov + prev.PA + prev.TerIncomp + lnpopl + 
  xpolity + xpolity.sq + diamond + onshore + excluded.1r

## Conflict duration equation
eq.c2 <- t1 ~ lnrgdpch1l + lnpopl+multi + War + TerIncomp + xpolity + xpolity.sq + 
  diamond + onshore + cold + excluded.1r

## Immunity equation
eq.cure <- rc ~ lnrgdpch1l + xpolity + xpolity.sq + lnpopl + excluded.1r


# Estimation --------------------------------------------------------------

model <- estimate.SPDDD( eq.p1, eq.c2, eq.p2, eq.cure, 
                         distr1="weibull", distr2="weibull", data = ucdp.triad )

summary( model )

## Coefficients (Table 2)

# Column 1 (Simple)
summary( model ) $ Independent.Model

# Column 2 (SUD)
summary( model ) $ SUR.Model

# Column 3 (SP)
summary( model ) $ Independent.Split.Model

# Column 4 (SPDDD)
summary( model ) $ SP.DDD

## Log-likelihood values for the four models (bottom of Table 2)

model $ model.ind $ value
model $ model.sur $ value
model $ model.spj $ value
model $ model.spd $ value

### AIC for the four models (bottom of Table 2)

2 * (length( model $ model.ind $ par ) + model $ model.ind $ value)
2 * (length( model $ model.sur $ par ) + model $ model.sur $ value)
2 * (length( model $ model.spj $ par ) + model $ model.spj $ value)
2 * (length( model $ model.spd $ par ) + model $ model.spd $ value)


# Print the table ---------------------------------------------------------

sink("Table2.txt")
print("Table 2")
print("======================================================================")
print("Column 1 (Simple)")
print(summary( model ) $ Independent.Model)
print("======================================================================")
print("Column 2 (SUD)")
print(summary( model ) $ SUR.Model)
print("======================================================================")
print("Column 3 (SP)")
print(summary( model ) $ Independent.Split.Model)
print("======================================================================")
print("Column 4 (SPDDD)")
print(summary( model ) $ SP.DDD)
print("======================================================================")
print("Log-likelihood values")
print("Column 1 (Simple)")
print(model $ model.ind $ value)
print("Column 2 (SUD)")
print(model $ model.sur $ value)
print("Column 3 (SP)")
print(model $ model.spj $ value)
print("Column 4 (SPDDD)")
print(model $ model.spd $ value)
print("======================================================================")
print("AIC values")
print("Column 1 (Simple)")
print(2 * (length( model $ model.ind $ par ) + model $ model.ind $ value))
print("Column 2 (SUD)")
print(2 * (length( model $ model.sur $ par ) + model $ model.sur $ value))
print("Column 3 (SP)")
print(2 * (length( model $ model.spj $ par ) + model $ model.spj $ value))
print("Column 4 (SPDDD)")
print(2 * (length( model $ model.spd $ par ) + model $ model.spd $ value))
sink()


# In-sample prediction ----------------------------------------------------

## Generate prediction data
failed.p1 <- ucdp.triad $ p1.data[ ucdp.triad $ p1.data $ rc == 0 & ucdp.triad $ p1.data $ lo == 1, ]
failed.c2 <- ucdp.triad $ c2.data[ ucdp.triad $ c2.data $ rc == 0 & ucdp.triad $ c2.data $ lo == 1, ]
failed.p2 <- ucdp.triad $ p2.data[ ucdp.triad $ p2.data $ rc == 0 & ucdp.triad $ p2.data $ lo == 1, ]
pred.data <- list(p1.data = failed.p1, c2.data = failed.c2, p2.data = failed.p2)

## Use the first observation per spell
p1.spells <- unique( failed.p1 $ spellID )
p2.spells <- unique( failed.p2 $ spellID )
c2.spells <- unique( failed.c2 $ spellID )

failed.p1.f <- ucdp.triad $ p1.data[ 
  ucdp.triad $ p1.data $ spellID %in% p1.spells & 
    ucdp.triad $ p1.data $ fo == 1, ]
failed.p2.f <- ucdp.triad $ p2.data[ 
  ucdp.triad $ p2.data $ spellID %in% p2.spells & 
    ucdp.triad $ p2.data $ fo == 1, ]
failed.c2.f <- ucdp.triad $ c2.data[ 
  ucdp.triad $ c2.data $ spellID %in% c2.spells & 
    ucdp.triad $ c2.data $ fo == 1, ]
failed.p1.f $ t1 <- failed.p1 $ t1
failed.p2.f $ t1 <- failed.p2 $ t1
failed.c2.f $ t1 <- failed.c2 $ t1
pred.data.f <- list(p1.data = failed.p1.f, c2.data = failed.c2.f, p2.data = failed.p2.f)

## auxiliary function for in-sample prediction (calculates Weibull parameters)
pred.w.w.ll <- function(theta, X.p1, X.c2, X.p2){
  
  ## Extract Parameters
  beta.p1 <- theta[ 1 : ncol(X.p1) ]
  beta.c2 <- theta[ (ncol(X.p1)+1) : (ncol(X.p1) + ncol(X.c2)) ]
  beta.p2 <- theta[ (ncol(X.p1) + ncol(X.c2) + 1) : (ncol(X.p1) + ncol(X.c2) + ncol(X.p2))]
  p.1 <- exp(theta[ncol(X.p1) + ncol(X.c2) + ncol(X.p2) + 1])
  p.2 <- exp(theta[ncol(X.p1) + ncol(X.c2) + ncol(X.p2) + 2])
  p.3 <- exp(theta[ncol(X.p1) + ncol(X.c2) + ncol(X.p2) + 3])
  
  ## Lambda = exp(-xb)
  lambda.1 <-  exp( -X.p1 %*% beta.p1 )	# pre-conflict peace
  lambda.2 <-  exp( -X.c2 %*% beta.c2 )	# conflict
  lambda.3 <-  exp( -X.p2 %*% beta.p2 )	# post-conflict peace
  
  ## Median time
  et.1 <- lambda.1^(-1) * log(2)^(1/p.1)
  et.2 <- lambda.2^(-1) * log(2)^(1/p.2)
  et.3 <- lambda.3^(-1) * log(2)^(1/p.3)
  
  ## Mean time
  et.1 <- lambda.1^(-1) * gamma( 1 + 1/p.1 )
  et.2 <- lambda.2^(-1) * gamma( 1 + 1/p.2 )
  et.3 <- lambda.3^(-1) * gamma( 1 + 1/p.3 )
  
  out <- list(et.1 = et.1, et.2 = et.2, et.3 = et.3)
  return(out)
}

## auxiliary function for in-sapmle prediction (calculates predicted duration)
pred.time <- function(model, data){
  
  ## extract parameters
  par.ind <- model$model.ind$par
  par.spd <- model$model.spd$par
  
  p1.data <- data$p1.data; spell.id.p1 <- p1.data$spellID
  c2.data <- data$c2.data; spell.id.c2 <- c2.data$spellID
  p2.data <- data$p2.data; spell.id.p2 <- p2.data$spellID
  
  a.p1 <- as.character(model$formula.p1)
  a.c2 <- as.character(model$formula.c2)
  a.p2 <- as.character(model$formula.p2)
  a.i  <- as.character(model$formula.i)
  lhb.p1 <- a.p1[2]; lhb.p2 <- a.p2[2]; lhb.c2 <- a.c2[2]; lhb.i <- a.i[2]
  rhb.p1 <- strsplit(a.p1[3], split=" + ", fixed=T)[[1]]
  rhb.c2 <- strsplit(a.c2[3], split=" + ", fixed=T)[[1]]
  rhb.p2 <- strsplit(a.p2[3], split=" + ", fixed=T)[[1]]
  rhb.i  <- strsplit(a.i[3],  split=" + ", fixed=T)[[1]]
  X.p1 <- p1.data[rhb.p1]; X.p1 <- as.matrix(cbind(1, X.p1))
  X.c2 <- c2.data[rhb.c2]; X.c2 <- as.matrix(cbind(1, X.c2))
  X.p2 <- p2.data[rhb.p2]; X.p2 <- as.matrix(cbind(1, X.p2))
  X.i1 <- p1.data[rhb.i ]; X.i1 <- as.matrix(cbind(1, X.i1))
  X.i2 <- p2.data[rhb.i ]; X.i2 <- as.matrix(cbind(1, X.i2))
  Y.p1 <- cbind(p1.data[lhb.p1], p1.data$t0, p1.data$rc, p1.data$lo)
  Y.c2 <- cbind(c2.data[lhb.c2], c2.data$t0, c2.data$rc, c2.data$lo, c2.data$prev.dur)
  Y.p2 <- cbind(p2.data[lhb.p2], p2.data$t0, p2.data$rc, p2.data$lo, p2.data$prev.dur)
  
  et.spd <- pred.w.w.ll(par.spd, X.p1 = X.p1, X.c2 = X.c2, X.p2 = X.p2)
  et.ind <- pred.w.w.ll(par.ind, X.p1 = X.p1, X.c2 = X.c2, X.p2 = X.p2)
  
  ## bind them
  p1 <- data.frame(observed = Y.p1$t1, spd = et.spd$et.1, ind = et.ind$et.1)
  c2 <- data.frame(observed = Y.c2$t1, spd = et.spd$et.2, ind = et.ind$et.2)
  p2 <- data.frame(observed = Y.p2$t1, spd = et.spd$et.3, ind = et.ind$et.3)
  
  out <- list(p1 = p1, c2 = c2, p2 = p2)
  return(out)
}

pred.insample <- pred.time(model, pred.data.f)


# RMSE estimates ----------------------------------------------------------

## pre-conflict
pred.insample$p1 $ se.spd <- (pred.insample$p1$spd - pred.insample$p1$observed)^2
pred.insample$p1 $ se.ind <- (pred.insample$p1$ind - pred.insample$p1$observed)^2

sqrt(mean(pred.insample$p1 $ se.spd)) # RMSE for SP-DDD
sqrt(mean(pred.insample$p1 $ se.ind)) # RMSE for simple Weibull

## post-conflict
pred.insample$p2 $ se.spd <- (pred.insample$p2$spd - pred.insample$p2$observed)^2
pred.insample$p2 $ se.ind <- (pred.insample$p2$ind - pred.insample$p2$observed)^2

sqrt(mean(pred.insample$p2 $ se.spd)) # RMSE for SP-DDD
sqrt(mean(pred.insample$p2 $ se.ind)) # RMSE for simple Weibull

## conflict
pred.insample$c2 $ se.spd <- (pred.insample$c2$spd - pred.insample$c2$observed)^2
pred.insample$c2 $ se.ind <- (pred.insample$c2$ind - pred.insample$c2$observed)^2

sqrt(mean(pred.insample$c2 $ se.spd)) # RMSE for SP-DDD
sqrt(mean(pred.insample$c2 $ se.ind)) # RMSE for simple Weibull



# Compare two predictions -------------------------------------------------

pred.res <- rbind( 
  pred.insample $ p1 [ , c("observed","spd","ind") ],
  pred.insample $ p2 [ , c("observed","spd","ind") ],
  pred.insample $ c2 [ , c("observed","spd","ind") ]
)

pred.res $ abs.diff.spd <- abs( pred.res $ observed - pred.res $ spd )
pred.res $ abs.diff.ind <- abs( pred.res $ observed - pred.res $ ind )
pred.res $ spd.better <- pred.res $ abs.diff.spd < pred.res $ abs.diff.ind

## Case-wise comparison
table( pred.res $ spd.better ) # IS SPDDD prediction better?


# Figure 6 (left): absolute deviation -------------------------------------

pred.res <- pred.res[ rev(order(pred.res $ observed)), ]

range( pred.res $ abs.diff.spd )
range( pred.res $ abs.diff.ind )
range( pred.res $ observed )

# greater than 50 years?
table( pred.res $ abs.diff.spd > 50 * 365)
table( pred.res $ abs.diff.ind > 50 * 365)

# greater than 25 years?
table( pred.res $ abs.diff.spd > 25 * 365)
table( pred.res $ abs.diff.ind > 25 * 365)

## plot
x.rng <- c(-45000 - 500, 45000 + 500)
y.rng <- c(0, 60*365)

pdf(file="Figure6-1_pred_absdiff.pdf", width=5, height=5)
par(mar=c(2.5,2.5,.5,.5)+0.1,cex.lab=1,cex.axis=0.8,
    mgp=c(1.5,0.5,0), cex.main=1, cex=0.8,bg="white")
plot(x.rng, y.rng, axes=F, type="n", xlab="Absolute deviation of estimated duration (years)",
     ylab="Observed duration (years)")
axis(1, at = c(-36500-500, -18250-500, 0, 18250+500, 36500+500), labels=c(100,50,0,50,100) )
axis(2, at = 365*seq(from=0, to=60, by=10), labels=c("0","10","20","30","40","50","60") )
for( i in 1: nrow(pred.res) ){
  # SPDDD
  lines( 
    x = c(500, pred.res $ abs.diff.spd[i] + 500), 
    y = c( pred.res$observed[i], pred.res$observed[i]) )
  
  # Naive	
  lines( 
    x = c(-500, - pred.res $ abs.diff.ind[i] - 500), 
    y = c( pred.res$observed[i], pred.res$observed[i]), col="gray")
}
legend(x=50*365, y=58*365, legend=c("SPDDD","Simple"), 
       lty=c("solid","solid"),col=c("black","gray"), cex=1)
dev.off()


# Figure 6 (right): difference in deviation -------------------------------

pred.res $ diff.dev <- pred.res $ abs.diff.spd - pred.res $ abs.diff.ind

x.rng <- c(-40000-500, 40000+500)
y.rng <- c(0, 365 * 60)

pdf(file="Figure6-2_pred_diff.pdf", width=5, height=5)
par(mar=c(2.5,2.5,.5,.5)+0.1,cex.lab=1,cex.axis=0.8,
    mgp=c(1.5,0.5,0), cex.main=1, cex=0.8,bg="white")
plot(x.rng, y.rng, axes=F, type="n", xlab="Difference in absolute deviation (years)",
     ylab="Observed duration (years)")
axis(1, at = c(-36500-500, -18250-500, 0, 18250+500, 36500+500), labels=c(100,50,0,50,100) )
axis(2, at = 365* seq(from=0, to=60, by=10), labels=c("0","10","20","30","40","50","60") )

for( i in 1: nrow(pred.res) ){
  if ( pred.res $ diff.dev[i] > 0 ){
    lines( 
      x = c(500, 500 + pred.res $ diff.dev[i] ), 
      y = c( pred.res$observed[i], pred.res$observed[i]) )
  } else{
    lines( 
      x = c(-500, -500 + pred.res $ diff.dev[i] ), 
      y = c( pred.res$observed[i], pred.res$observed[i]), col="gray")
  }
}
legend(x=50*365, y=58*365, legend=c("SPDDD","Simple"), 
       lty=c("solid","solid"),col=c("black","gray"), cex=1)
dev.off()


# End of file -------------------------------------------------------------
